1 Preparing the environment for script execution

  1. Download maxent.jar and place it in the same folder as this document
  2. Download and install R
  3. Download and install RTools 3.5
  4. Download and install RStudio
  5. Install and load the devtools package (run the commands using the RStudio console)
    • install.packages("devtools")
    • library(devtools)
  6. Install the sdm package and the packages it depends on (run the commands using the RStudio console)
    • install.packages("sdm")
    • library(sdm)
    • installAll()
  7. Install all other required packages (run commands using RStudio console)
    • install.packages(c("raster", "rgdal", "sf", "rgeos", "ggplot2", "cowplot", "plotly", "mapview", "ggfortify", "scales", "factoextra ", "ggcorrplot", "prettyGraphs", "Rtsne", "DT", "parallel", "snow", "sdm", "FactoMineR", "rdist", "usdm", "dplyr", "tidyr", "purrr", "purrrlyr", "here", "fs", "stringr", "exactextractr", "gdalUtils", "dismo", "rgbif", "boot"))

2 Downloading data:

2.1 Downloading rasters

First we create a folder in our working directory to include input data.

# Create folder to store inputs.
if(!dir_exists('input_data')){
  dir_create('input_data')
}

Now let’s download data from WorldClim 2.1. Note that when R downloads a file it has a timeout of 60 seconds. This may not be enough to download environmental data, so we can set options(timeout=n), where n is the number of seconds we need to download the data.

# This option allows us to control how much time do we need to download the data. If R takes more than 10 minutes (600 seconds) to download the data it will stop the download. Increase the timeout if needed.
options(timeout=600)

# Current data
# Files are automatically saved in input_data folder.
#WorldClim_data('current', variable = 'bioc', resolution = 10)

# Future data
gcms <- c("ac", "ae", "bc", "ca", "cc", "ce", "cn", 
        "ch", "cr", "ec", "ev", "fi", "gg", "gh", "hg", 
        "in", "ic", "ip", "me", "mi", "mp", "ml", "mr", "uk")
# WorldClim_data('future', variable = 'bioc', year = '2090', gcm = gcms, ssp = c('585'), resolution = 10) 

2.2 Obtaining occurrence data from GBIF:

# Downloading data from GBIF
# File is automatically saved in input_data folder
# spp_data <- GBIF_data('Colossoma macropomum')

2.3 We will use our own data:

spp_data <- here('input_data/spp_data.csv')

2.4 Downloading shapefile for study area

# Obtaining Natural Earth data:
#shape_study_area <- rnaturalearth::ne_download(scale = 50, type = "rivers_lake_centerlines", category = "physical")

3 Geoprocessing:

3.1 Open Files and Data

Firstly, we name inputs and outputs, caring for using the correct extensions. a) Inputs:

# Shapefile (polygon or lines) delimiting study area.
shape_study_area <- here("input_data/shape_study_area/AmazonHydroRivers4.shp")  

# Directory name containing current rasters to be rescaled.
folder_current_rasters <- here("input_data/WorldClim_data_current")

# Directory name containing future rasters to be rescaled.
folder_future_rasters <- here("input_data/WorldClim_data_future")
  1. Outputs:
# Name of shapefile (.shp) for the study area to be saved.
output_shp_study_area <- here("output_data/grid/Colossoma_grid.shp")

# Name of R object (.rds) where the current rescaled variables will be saved.
output_shp_current <- here("output_data/WorldClim_data_current_rescaled/Colossoma_current.rds")
  1. Seting up some important variables:
# Cell hight and width for the grid.
# This value depends on rasters projection. If rasters are in UTM, values will be in meters. If rasters are in decimal degrees (as in WorldClim 2.1), values will be in degrees. However, note that the function make_grid (used to build the grid above study area) has an argument called epsg where we can reproject spatial data. The epsg of study area is further transmitted to predictor variables. This means that even if WorldClim 2.1 is projected in decimal degrees we should address cell sizes in the desired epsg.

# Following values build a grid with 100 x 100 km.
cell_width = 100000
cell_height = 100000
# Note that setting those values to build a grid smaller than the input rasters may generate NaNs, causing some problems.

# If you have any variable in shape_study_area that you want to keep for rescaling, you can set here.
# Set the correct names of variables using as much as 10 characters.
# Setting the names to list() will use none of the variables in the shape, while setting it to NULL will use all variables.
names_var_shp_study_area <-  list()

# As in the codeline above, here we set which variables in current rasters we want to keep for rescaling.
# Set the correct names of variables using as much as 10 characters.
# Setting the names to list() will use none of the variables in the shape, while setting it to NULL will use all variables.
current_var_names <- paste0('bio_', 1:19) # or NULL

# As in the codelines above, here we set which variables in future rasters we want to keep for rescaling.
# We will usually need at least the same variables as in the current scenario for projection.
# Set the correct names of variables using as much as 10 characters.
# Setting the names to list() will use none of the variables in the shape, while setting it to NULL will use all variables.
future_var_names <-  current_var_names 

3.2 Study Area

The map of study area needs to be imported to R, so we can create a grid for the study area. This grid will be used for model building and projections.

shape_study_area %<>%
  st_read() %>%
  repair_shp()
## Reading layer `AmazonHydroRivers4' from data source 
##   `/Users/luizesser/Documents/Post-Doc/NAPI-EC/ForestFisher/Modelagens/input_data/shape_study_area/AmazonHydroRivers4.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 107294 features and 14 fields
## Geometry type: LINESTRING
## Dimension:     XY
## Bounding box:  xmin: -79.37708 ymin: -20.21042 xmax: -48.44375 ymax: 5.139583
## Geodetic CRS:  WGS 84
if (output_shp_study_area %>% file_exists()== F){
  grid_study_area <- shape_study_area %>% 
      make_grid(cell_width, cell_height, names_var_shp_study_area, epsg=6933)
  
  output_shp_study_area %>% 
    path_dir() %>% 
    dir_create()
  
  grid_study_area %>% st_write(
      dsn = output_shp_study_area)
} else {
  grid_study_area <- output_shp_study_area %>% st_read()
}
## Reading layer `Colossoma_grid' from data source 
##   `/Users/luizesser/Documents/Post-Doc/NAPI-EC/ForestFisher/Modelagens/output_data/grid/Colossoma_grid.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 642 features and 3 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -7600000 ymin: -2500000 xmax: -4600000 ymax: 7e+05
## Projected CRS: WGS 84 / NSIDC EASE-Grid 2.0 Global

3.3 Compare GCMs

Comparing GCMs is important so we don’t need to project all the GCMs. We propose here an alternative method that calculates a k-means given the number of groups you want. First we need to rescale future variables. To make GCMs comparable, we use only one SSP and one time-period. It is only supported, so far, to use bio_1 and bio_12 (both are used to infer other bioclimatic variables).

### Setting inputs and outputs
scenarios_gcm <- 585
gcm_names <- gcms

## Exclude: MIROC_ESM, MIROC_ESM_CHEM, MIROC5 and CSIRO_MK3_6_0 (CMIP5, not CMIP6)

#gcms_result <- compare_gcms(folder_future_rasters, grid_study_area, var_names=c('bio_1','bio_12'), gcm_names, scenarios_gcm, k=5)

#print(gcms_result)

3.4 Rescaling variables

The next step aims to cross data from study area with rasters of variables. We will start with current data.

### Rescaling current data
if (output_shp_current %>% file_exists() == F) {
  grid_current <- grid_study_area %>% 
    add_raster(folder_current_rasters, current_var_names) 
    
  output_shp_current %>% 
    path_dir() %>% 
    dir_create()
  
  grid_current %>% saveRDS(output_shp_current)
} else {
  grid_current <- output_shp_current %>% readRDS()
}

Now within a loop to rescale future variables.

# Downloading future data for selected gcms:
# WorldClim_data('future', variable = 'bioc', year = c('2030','2050','2070','2090'), 
#                gcm =gcms_result$suggested_gcms, ssp = c('245','585'), resolution = 10) 

# Name of R objects (.rds) where the future rescaled variables will be saved.
ssps <- c('ssp245', 'ssp585') # Scenarios names
#gcms <- gcms_result$suggested_gcms
gcms <- c('ac', 'bc', 'ca', 'ev', 'mr')
scenarios <- apply(expand.grid(gcms, ssps, 10, c(2030,2050,2070,2090)), 1, paste, collapse="_")
output_shp_future <- here(paste0("output_data/WorldClim_data_future_rescaled/",
                                 scenarios,
                                 ".rds"))

### Rescaling future data
if (!all(output_shp_future %>% file_exists())) {
  for (i in 1:length(scenarios)) {

      grid_future <- grid_study_area %>% 
       add_raster(folder_future_rasters, future_var_names, scenarios[i]) 
      
      output_shp_future %>% 
       path_dir() %>% 
       dir_create()
      
      grid_future %>% saveRDS(output_shp_future[grep(scenarios[i], output_shp_future)])
    
  }
}

grid_future <- lapply(output_shp_future,function(x){readRDS(x)})
names(grid_future) <- scenarios

4 Occurrence data

4.1 Open files with data

It is necessary to name the output. Be extra careful with extension names. a) Output:

#  Set the path to the output shapefile, which will contain the presence/absence matrix.
spp_output <- here("output_data/spp_pa.shp")

It is also necessary to set some other important parameters.

# Species names to be used in the study. 
# Names should be identical from input/spp_data.csv obtained previously.
# Setting this to NULL will use all species.
spp_names <- read.csv(spp_data)$species %>% table() %>% names()# or NULL

# Set which CRS projection is the grid_study_area.
crs_occ_data <- "+init=epsg:4326"

4.2 Importing occurrence data

occ_shp <- spp_data %>% 
  occurrences_to_shapefile(spp_names, grid_study_area, CRS(crs_occ_data))

plot(occ_shp)

4.3 Occurrence grid for the Study Area

We will say to the grid_study_area which cells have an occurrence record for the studied species.

if (spp_output %>% file_exists() == F) {
  grid_matrix_pa <- occ_shp %>% 
    occurrences_to_pa_shapefile(grid_study_area, spp_names)
  
  spp_output %>% 
    path_dir() %>% 
    dir_create()
  
  grid_matrix_pa %<>% select(-c('x_centroid', 'y_centroid', 'cell_id'))
  
  grid_matrix_pa %>% st_write(dsn = spp_output)
} else {
  grid_matrix_pa <- spp_output %>% st_read()
}
## Reading layer `spp_pa' from data source 
##   `/Users/luizesser/Documents/Post-Doc/NAPI-EC/ForestFisher/Modelagens/output_data/spp_pa.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 642 features and 33 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -7600000 ymin: -2500000 xmax: -4600000 ymax: 7e+05
## Projected CRS: WGS 84 / NSIDC EASE-Grid 2.0 Global
grid_matrix_pa %>% 
  mutate(sum = rowSums(across(where(is.numeric)))) %>%
  mutate(across(sum, as.factor)) %>%
  ggplot() +
      geom_sf(aes(fill = sum), color=NA) +
      scale_fill_brewer(palette = "YlGnBu")   
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette YlGnBu is 9
## Returning the palette you asked for with that many colors

Check how many records there is to each species:

sapply(colnames(grid_matrix_pa)[-ncol(grid_matrix_pa)], 
       function(x){
          sum(as.data.frame(grid_matrix_pa)[,x])},
       USE.NAMES = T)
## astrnts_cl achnptrch_ brycn_mznc brycn_flct brycn_mlnp clphyss_mc chlcs_ryth 
##         93         84         85        106        109        141         63 
## clssm_mcrp hmds_mmclt hplrythrn_ lprns_gssz lprns_brnn lprns_fsct lprns_frdr 
##         97         78        231        110         60        195        283 
## lprns_klsw lthdrs_drs mglprns_tr mtynns_hyp mylpls_str mylpls_rbr mylpls_trq 
##         29         20         71         82        111        153         60 
## mylossm_rm oxydrs_ngr phrctcphl_ prcts_brch pmlds_blch pltynmtch_ ptrdrs_grn 
##         97        108         89         84        227         55         94 
## schzdn_fsc srrslms_gl srrslms_sp trprths_lb trprths_ng 
##        164         53        117        208        192

5 Variable Selection - VIF

Variable Selection must be done OR with a VIF routine, OR with a PCA. To perform a VIF routine, we will use only the environmental data from the occurrence data.

# Obtain occurrence data.frame and environmental variables:
df_vif <- get_df_vif(grid_current, grid_matrix_pa)
## var_names not informed. Variables detected:  bio_1, bio_2, bio_3, bio_4, bio_5, bio_6, bio_7, bio_8, bio_9, bio_10, bio_11, bio_12, bio_13, bio_14, bio_15, bio_16, bio_17, bio_18, bio_19sp_names not informed. Detected species:  astrnts_cl, achnptrch_, brycn_mznc, brycn_flct, brycn_mlnp, clphyss_mc, chlcs_ryth, clssm_mcrp, hmds_mmclt, hplrythrn_, lprns_gssz, lprns_brnn, lprns_fsct, lprns_frdr, lprns_klsw, lthdrs_drs, mglprns_tr, mtynns_hyp, mylpls_str, mylpls_rbr, mylpls_trq, mylossm_rm, oxydrs_ngr, phrctcphl_, prcts_brch, pmlds_blch, pltynmtch_, ptrdrs_grn, schzdn_fsc, srrslms_gl, srrslms_sp, trprths_lb, trprths_ng
# Run VIF routine from usdm package:
vif_bio <- lapply(df_vif, function(x){vifcor(x,th=0.5)})
             
vif_bio[[1]]
## 15 variables from the 19 input variables have collinearity problem: 
##  
## bio_17 bio_10 bio_16 bio_11 bio_1 bio_6 bio_15 bio_5 bio_12 bio_7 bio_9 bio_3 bio_14 bio_19 bio_2 
## 
## After excluding the collinear variables, the linear correlation coefficients ranges between: 
## min correlation ( bio_8 ~ bio_4 ):  0.03229706 
## max correlation ( bio_18 ~ bio_4 ):  -0.2186792 
## 
## ---------- VIFs of the remained variables -------- 
##   Variables      VIF
## 1     bio_4 1.104933
## 2     bio_8 1.008966
## 3    bio_13 1.063787
## 4    bio_18 1.077752
# We can exclude variables with high VIF or go to the next step.
grid_current_vif <-  sapply(names(df_vif), function(x){select(grid_current, !vif_bio[[x]]@excluded)}, USE.NAMES = T)

6 Variable Selection - PCA

  1. Outputs:
# Name of the shapefile in which the PCA values will be stored.
shp_pca <- here("output_data/pca/shp_pca.rds")

# Name of the shapefile in which current PCA projection will be stored.
shp_preds <- here("output_data/pca/shp_preds.rds")

# Name of the shapefile in which future PCA projection will be stored.
shp_preds_future <- here("output_data/pca/shp_preds_future.rds")
  1. Transforming objects to be used in the next steps:
shp_matrix_pa <-  spp_output %>%
  st_read()
## Reading layer `spp_pa' from data source 
##   `/Users/luizesser/Documents/Post-Doc/NAPI-EC/ForestFisher/Modelagens/output_data/spp_pa.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 642 features and 33 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -7600000 ymin: -2500000 xmax: -4600000 ymax: 7e+05
## Projected CRS: WGS 84 / NSIDC EASE-Grid 2.0 Global
df_species <- shp_matrix_pa %>% 
  as.data.frame() %>%
  select(-c('geometry'))

df_var_preditors <- output_shp_current %>%
  get_predictors_as_df()

6.1 Control Variables

# Names of variables to be normalized (centered and scaled).
# Normalization can improve the modelling, the calculation of PCAs and the clusterization algorithms used to generate pseudoabsences.
var_normalization <- current_var_names # OR: paste0("bio_",1:19)

# Names of variables to be used in PCA.
var_pca_bio <- current_var_names # OR: paste0("bio_",1:19)

# Number of PCA-axes to be retained.
nr_comp_pca_bio <- 4

# Names of variables to be used as predictors when training the models. It can be environmental variables or PCA-axes.
var_predictors <-c("dim_1_bio","dim_2_bio","dim_3_bio","dim_4_bio")

6.2 Preparing Predictor Variables

6.2.1 Standardizing and normalizing

df_potential_predictors <- df_species %>%
  bind_cols(df_var_preditors)

df_potential_predictors <- df_potential_predictors %>% 
  center_scale(var_normalization)

df_potential_predictors %>% 
  head() %>% 
  round(4) %>% 
  datatable(options = list(pageLength = 10, scrollX=T))

6.3 Preparing Variables for PCA

df_var_pca <- df_potential_predictors %>% 
  select(var_pca_bio[which(var_pca_bio %in% colnames(df_potential_predictors))])

df_var_pca %>%
  head() %>% 
  round(4) %>% 
  datatable(options = list(pageLength = 10, scrollX=T))

6.3.1 PCA analysis for variables.

6.3.1.1 Correlation Matrix

df_var_pca %>% 
  corr_plot()
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

6.3.1.2 PCA analysis

The table that follows shows the values of the axes and variables. A tabela a seguir mostra os valores dos eixos e variaveis.

if (shp_pca %>% file_exists() == T) {
  file_delete(shp_pca)
}
pca_bio <- df_var_pca %>% 
  calc_pca(nr_comp_pca_bio, "bio")

pca_bio$var$loadings %>% 
  round(4) %>% 
  datatable(options = list(pageLength = 10, scrollX=T))
if(!dir_exists('output_data/pca/')){
  dir_create('output_data/pca/')
}

grid_study_area %>% 
  bind_cols(pca_bio$ind$coord %>% as.data.frame()) %>% 
  saveRDS(file = shp_pca)

Summary of dimensions. The following table shows the correlation between variables with axes (i.e. the significance from each variable to given axle).

pca_bio %>% 
  dt_pca_summ()

6.3.1.3 Variable Contribution

pca_bio %>% 
  contrib_scree()

pca_bio %>% 
  contrib_corr()

pca_bio %>% 
  contrib_dims()

6.3.1.4 Quality of Variables’ Representation

PCA-axes considering the quality of variables’ representation.

pca_bio %>% 
  pca_cos2()

Quality of variables’ representation on axes.

pca_bio %>% 
  cos2_dims()

pca_bio %>% 
  cos2_corr()

pca_bio %>% 
  pca_bi_plot(df_species %>% rowMeans() %>% ceiling())

pca_bio %>% 
  comp_pca_retain()
## $broken.stick
## [1] 2
## 
## $kaiser.mean
## [1] 4
## 
## $horn.p
## [1] 4

6.4 Generating Shapefile with Predictors

6.4.1 Jointing PCA to variables

df_potential_predictors <- df_potential_predictors %>%
  bind_cols(pca_bio$ind$coord %>% as.data.frame()) 

df_potential_predictors %>% 
  head() %>% 
  round(4) %>%
  datatable(options = list(pageLength = 10, scrollX=T))
if (shp_preds %>% file_exists() == F){
  df_var_preditors <- df_potential_predictors %>% 
    select(var_predictors %>% all_of())
  
  grid_study_area %>% 
    select(cell_id) %>% 
    bind_cols(df_var_preditors) %>% 
    saveRDS(file = shp_preds)
}

6.5 Projecting PCA-axes to the future scenarios.

pca_future <- proj_pca(pca_bio, grid_future)
#pca_future

pca_future %>% 
    saveRDS(file = shp_preds_future)

7 Training Models

7.1 Set Data

As in previous steps, it is necessary to set inputs and outputs, taking extra care with extension names. a) Input:

# Let's use the PCA-axes built in the last step:
df_var_preditors <- df_potential_predictors[,colnames(df_potential_predictors) %in% var_predictors]
grid_future <- pca_future
  1. Outputs:
# Name the directory to save trained models.
folder_models <- here("output_data/models")
  1. Control Variables:
# Algorithm names to be used in Species Distribution Modeling.
# Run getmethodNames() to unveil which algorithms are available.
algo <- c("svm", "rbf", "mda")

# Set the threshold criteria.
# 1:sp=se, 2:max(se+sp), 3:min(cost), 4:minROCdist, 5:max(kappa), 6:max(ppv+npv), 7:ppv=npv, 8:max(NMI), 9:max(ccr), 10: prevalence
thresh_criteria <- 2

# Number of runs to each algorithm
n_run <- 10

# Number of folds to crossvalidation
n_folds <- 4

# Number of pseudoabsence sets
n_pa <- 1

7.2 Generate Pseudoabsences

To build models, it is necessary to use pseudoabsences that contrast to presences. Currently, only the ‘random’ method is applied.

df_pseudoabsences <- shp_matrix_pa %>%
  pseudoabsences(df_var_preditors, spp_names, method="random", folder_models) 

It is possible to plot a t-SNE graph to check whether pseudoabsence data clusters into a separate group from presence data.

df_potential_predictors %>% 
  tsne_plot(df_pseudoabsences, spp_names[1])
## Warning: Using one column matrices in `filter()` was deprecated in dplyr 1.1.0.
## ℹ Please use one dimensional logical vectors instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

7.3 Join data

As we are using the sdm package, let’s start to build our models by indicating our input data.

d <- df_species %>% 
  fit_data(df_var_preditors, df_pseudoabsences)
#d

7.4 Training Models

With the data, we can build our models.

df_species %>%
  train_models_to_folder(
      d, 
      algo, 
      n_run, 
      n_folds, 
      folder_models
    )
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)... 
## 
## The variable importance for all the models are combined (averaged)...
folder_models %>%
  dir_ls() %>%
  path_file() %>% 
  head() %>%
  as.data.frame()
"Number of trained species: " %>%
  paste0( 
    folder_models %>%
      dir_ls() %>% 
      length()
  ) %>%
  print()
## [1] "Number of trained species: 33"

How many models failed?

d %>% 
  model_failures(folder_models)
## [1] 0

7.5 Threshold visualization

spp_names <- colnames(df_species)

thresholds_models <- spp_names %>% 
  sp_thresh_from_folder(folder_models, thresh_criteria)

thresholds_models_means <- spp_names %>%  
  sp_thresh_mean_from_folder(folder_models, thresholds_models)

thresholds_models_means  
## $astrnts_cl
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 astrnts_cl mda    0.479
## 2 astrnts_cl rbf    0.431
## 3 astrnts_cl svm    0.458
## 
## $achnptrch_
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 achnptrch_ mda    0.489
## 2 achnptrch_ rbf    0.320
## 3 achnptrch_ svm    0.476
## 
## $brycn_mznc
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 brycn_mznc mda    0.486
## 2 brycn_mznc rbf    0.724
## 3 brycn_mznc svm    0.511
## 
## $brycn_flct
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 brycn_flct mda    0.439
## 2 brycn_flct rbf    0.363
## 3 brycn_flct svm    0.440
## 
## $brycn_mlnp
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 brycn_mlnp mda    0.512
## 2 brycn_mlnp rbf    0.530
## 3 brycn_mlnp svm    0.498
## 
## $clphyss_mc
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 clphyss_mc mda    0.464
## 2 clphyss_mc rbf    0.452
## 3 clphyss_mc svm    0.448
## 
## $chlcs_ryth
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 chlcs_ryth mda    0.539
## 2 chlcs_ryth rbf    0.733
## 3 chlcs_ryth svm    0.527
## 
## $clssm_mcrp
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 clssm_mcrp mda    0.487
## 2 clssm_mcrp rbf    0.417
## 3 clssm_mcrp svm    0.515
## 
## $hmds_mmclt
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 hmds_mmclt mda    0.458
## 2 hmds_mmclt rbf    0.220
## 3 hmds_mmclt svm    0.524
## 
## $hplrythrn_
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 hplrythrn_ mda    0.443
## 2 hplrythrn_ rbf    0.410
## 3 hplrythrn_ svm    0.452
## 
## $lprns_gssz
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 lprns_gssz mda    0.528
## 2 lprns_gssz rbf    0.512
## 3 lprns_gssz svm    0.451
## 
## $lprns_brnn
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 lprns_brnn mda    0.498
## 2 lprns_brnn rbf    0.516
## 3 lprns_brnn svm    0.463
## 
## $lprns_fsct
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 lprns_fsct mda    0.450
## 2 lprns_fsct rbf    0.462
## 3 lprns_fsct svm    0.480
## 
## $lprns_frdr
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 lprns_frdr mda    0.444
## 2 lprns_frdr rbf    0.404
## 3 lprns_frdr svm    0.440
## 
## $lprns_klsw
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 lprns_klsw mda    0.427
## 2 lprns_klsw rbf    0.449
## 3 lprns_klsw svm    0.470
## 
## $lthdrs_drs
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 lthdrs_drs mda    0.507
## 2 lthdrs_drs rbf    1.95 
## 3 lthdrs_drs svm    0.559
## 
## $mglprns_tr
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 mglprns_tr mda    0.474
## 2 mglprns_tr rbf    0.373
## 3 mglprns_tr svm    0.472
## 
## $mtynns_hyp
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 mtynns_hyp mda    0.449
## 2 mtynns_hyp rbf    0.355
## 3 mtynns_hyp svm    0.419
## 
## $mylpls_str
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 mylpls_str mda    0.457
## 2 mylpls_str rbf    0.376
## 3 mylpls_str svm    0.465
## 
## $mylpls_rbr
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 mylpls_rbr mda    0.447
## 2 mylpls_rbr rbf    0.444
## 3 mylpls_rbr svm    0.461
## 
## $mylpls_trq
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 mylpls_trq mda    0.529
## 2 mylpls_trq rbf    0.374
## 3 mylpls_trq svm    0.531
## 
## $mylossm_rm
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 mylossm_rm mda    0.482
## 2 mylossm_rm rbf    0.419
## 3 mylossm_rm svm    0.521
## 
## $oxydrs_ngr
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 oxydrs_ngr mda    0.465
## 2 oxydrs_ngr rbf    0.429
## 3 oxydrs_ngr svm    0.514
## 
## $phrctcphl_
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 phrctcphl_ mda    0.458
## 2 phrctcphl_ rbf    0.482
## 3 phrctcphl_ svm    0.516
## 
## $prcts_brch
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 prcts_brch mda    0.488
## 2 prcts_brch rbf    0.442
## 3 prcts_brch svm    0.514
## 
## $pmlds_blch
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 pmlds_blch mda    0.443
## 2 pmlds_blch rbf    0.472
## 3 pmlds_blch svm    0.528
## 
## $pltynmtch_
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 pltynmtch_ mda    0.464
## 2 pltynmtch_ rbf    0.551
## 3 pltynmtch_ svm    0.415
## 
## $ptrdrs_grn
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 ptrdrs_grn mda    0.532
## 2 ptrdrs_grn rbf    0.479
## 3 ptrdrs_grn svm    0.536
## 
## $schzdn_fsc
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 schzdn_fsc mda    0.475
## 2 schzdn_fsc rbf    0.598
## 3 schzdn_fsc svm    0.491
## 
## $srrslms_gl
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 srrslms_gl mda    0.493
## 2 srrslms_gl rbf    4.62 
## 3 srrslms_gl svm    0.502
## 
## $srrslms_sp
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 srrslms_sp mda    0.483
## 2 srrslms_sp rbf    0.592
## 3 srrslms_sp svm    0.512
## 
## $trprths_lb
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 trprths_lb mda    0.421
## 2 trprths_lb rbf    0.422
## 3 trprths_lb svm    0.465
## 
## $trprths_ng
## # A tibble: 3 × 3
## # Rowwise:  species, method
##   species    method  mean
##   <chr>      <chr>  <dbl>
## 1 trprths_ng mda    0.453
## 2 trprths_ng rbf    0.432
## 3 trprths_ng svm    0.431

8 Projections

To project our models in space, we need to set where the models were saved (previously set as the folder_models object) and where we want to save our projections.

directory_projections <- here("output_data/projections")

Set up some variables.

df_pa <- shp_matrix_pa %>% 
  as.data.frame() %>%
  select(-c('geometry'))

df_potential_predictors <- df_pa %>% 
  bind_cols(df_var_preditors)

projection_data <- lapply(grid_future, function(x){ x <- as.data.frame(x)
                                           x[!(names(x) %in% c("x_centroid", "y_centroid", "geometry"))]})
projection_data$current <- df_var_preditors

And finally run our projections.

# Project models in scenarios
df_pa %>% predict_to_folder(
      projection_data,
      folder_models, 
      grid_study_area,
      algo, 
      thresh_criteria, 
      directory_projections,
      thresholds_models_means
    )

9 Visualizing Results

9.1 Obtain predictions

consensus <- function(spp_names){lapply(spp_names, function(y){
  sapply(dir_ls(glob = paste0("*", y, "*pa.csv"), recurse = T) %>% purrr::map(vroom, show_col_types = FALSE), 
         function(x){data.frame(c=x$consensus)}, 
         simplify = T) %>% 
  as.data.frame() %>%
  rowSums()}) %>%
  as.data.frame() %>%
  rowSums()}

consensus_vals <- consensus(spp_names)

9.2 Presence/Absence Consensus for Colossoma.macropomum in current scenario

p99 <- consensus_vals %>% quantile(probs=0.99)
p95 <- consensus_vals %>% quantile(probs=0.95)
ensemble_map(p99, grid_study_area)
## Warning in data.frame(...): row names were found from a short variable and have
## been discarded

ensemble_map(p95, grid_study_area)
## Warning in data.frame(...): row names were found from a short variable and have
## been discarded

ensemble_map(consensus_vals, grid_study_area)